Machine Learning Introduction



In [ ]:

    
from IPython.display import Image, display, HTML
# Render the image file images/munich.jpg as this cell's output.
Image("images/munich.jpg")



In [ ]:

    
# Two-column table: the painting on the left, the stylised Munich image on the right.
# The caption tags are nested as <p><b>..</b></p> so the markup is well-formed.
display(HTML(
    "<table><tr>"
    "<td><p><b>Rain Princess - Leonid Afremov</b></p>"
    "<img src='images/princess.jpeg'></td>"
    "<td><p><b>Munich + Rain Princess + Machine Learning</b></p>"
    "<img src='images/munich-princess-out.jpg'></td>"
    "</tr></table>"
))



In [ ]:

    
# Two-column table: the painting on the left, the stylised Munich image on the right.
# The caption tags are nested as <p><b>..</b></p> so the markup is well-formed.
display(HTML(
    "<table><tr>"
    "<td><p><b>The Great Wave off Kanagawa - Katsushika Hokusai</b></p>"
    "<img src='images/wave.jpg'></td>"
    "<td><p><b>Munich + The Great Wave + Machine Learning</b></p>"
    "<img src='images/munich-wave-out.jpg'></td>"
    "</tr></table>"
))



In [ ]:

    
# Two-column table: the painting on the left, the stylised Munich image on the right.
# Fixes the misspelt painter name and nests the caption tags as <p><b>..</b></p>.
display(HTML(
    "<table><tr>"
    "<td><p><b>La Muse - Pablo Picasso</b></p>"
    "<img src='images/muse.jpg'></td>"
    "<td><p><b>Munich + La Muse + Machine Learning</b></p>"
    "<img src='images/munich-muse-out.jpg'></td>"
    "</tr></table>"
))



In [ ]:

    
# Two-column table: the painting on the left, the stylised Munich image on the right.
# The caption tags are nested as <p><b>..</b></p> so the markup is well-formed.
display(HTML(
    "<table><tr>"
    "<td><p><b>Udnie - Francis Picabia</b></p>"
    "<img src='images/udnie.jpg'></td>"
    "<td><p><b>Munich + Udnie + Machine Learning</b></p>"
    "<img src='images/munich-udnie-out.jpg'></td>"
    "</tr></table>"
))



In [ ]:

    
# Two-column table: the painting on the left, the stylised Munich image on the right.
# Both captions are nested as <p><b>..</b></p> so the markup is well-formed.
display(HTML(
    "<table><tr>"
    "<td><p><b>Scream - Edvard Munch</b></p>"
    "<img src='images/scream.jpg'></td>"
    "<td><p><b>Munich + Scream + Machine Learning</b></p>"
    "<img src='images/munich-scream-out.jpg'></td>"
    "</tr></table>"
))



In [ ]:

    
# Two-column table: the painting on the left, the stylised Munich image on the right.
# The caption tags are nested as <p><b>..</b></p> so the markup is well-formed.
display(HTML(
    "<table><tr>"
    "<td><p><b>The Shipwreck of the Minotaur - Joseph Mallord William Turner</b></p>"
    "<img src='images/wreck.jpg'></td>"
    "<td><p><b>Munich + Shipwreck + Machine Learning</b></p>"
    "<img src='images/munich-wreck-out.jpg'></td>"
    "</tr></table>"
))



In [ ]:

    
# A bit about MNIST dataset
# NOTE(review): the tensorflow.examples.tutorials package appears to exist only
# in TensorFlow 1.x - confirm the installed version before running this cell.
from tensorflow.examples.tutorials.mnist import input_data
data = input_data.read_data_sets("data/MNIST/", one_hot=True)
import numpy as np
from scipy.stats import norm
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
try:
    # Location of scatter_matrix in current pandas releases.
    from pandas.plotting import scatter_matrix
except ImportError:
    # Older pandas releases only provide it under pandas.tools.plotting.
    from pandas.tools.plotting import scatter_matrix
import seaborn as sns



In [ ]:

    
%matplotlib inline
# Integer class of every test image: the position of the largest entry in its label row.
data.test.cls = np.array([np.argmax(one_hot) for one_hot in data.test.labels])
# Side length, in pixels, of one (square) MNIST image.
img_size = 28
# Length of the one-dimensional array that stores a single image.
img_size_flat = img_size ** 2
# (height, width) tuple used to reshape a flat image for plotting.
img_shape = (img_size,) * 2
# One class per digit 0-9.
num_classes = 10
def plot_images(images, cls_true, cls_pred=None):
    """Draw exactly nine images in a 3x3 grid, each captioned with its class.

    images   -- nine flat images; each is reshaped to `img_shape` before drawing.
    cls_true -- the nine true class labels.
    cls_pred -- optional nine predicted labels; when given they are appended
                to each caption.

    Raises AssertionError unless `images` and `cls_true` both have length 9.
    """
    assert len(images) == len(cls_true) == 9

    # 3x3 grid with extra spacing so the captions do not overlap.
    fig, axes = plt.subplots(3, 3)
    fig.subplots_adjust(hspace=0.3, wspace=0.3)

    for idx, axis in enumerate(axes.flat):
        axis.imshow(images[idx].reshape(img_shape), cmap='binary')

        # Caption: true class alone, or true and predicted class.
        if cls_pred is not None:
            caption = "True: {0}, Pred: {1}".format(cls_true[idx], cls_pred[idx])
        else:
            caption = "True: {0}".format(cls_true[idx])
        axis.set_xlabel(caption)

        # Tick marks carry no information for an image.
        axis.set_xticks([])
        axis.set_yticks([])
# First nine test images together with their true classes.
images = data.test.images[:9]
cls_true = data.test.cls[:9]
# Draw them with the helper defined above (no predictions yet).
plot_images(images, cls_true)

References:

Python basics

Variables



In [ ]:

    
# String
string = 'Machine learning '
string2 = ' dojo '
string3 = ' part I'
# '+' concatenates strings; print(...) with one argument works on Python 2 and 3.
print(string + string2 + string3)
print('String variable type is: {}'.format(type(string)))



In [ ]:

    
# Integers
number = 10
number2 = 20
number3 = 30
# print(...) with one argument works on Python 2 and 3.
print(number + number2 + number3)
print('number variable type is: {}'.format(type(number)))



In [ ]:

    
# Booleans
boolean = True
boolean2 = True
boolean3 = False
# 'and' binds tighter than 'or': (boolean and boolean2) or boolean3.
print(boolean and boolean2 or boolean3)
print('boolean variable type is: {}'.format(type(boolean)))



In [ ]:

    
# Floating point numbers
floating = 3.14
floating2 = 2.79
floating3 = 10.01
# print(...) with one argument works on Python 2 and 3.
print(floating + floating2 + floating3)
print('floating variable type is: {}'.format(type(floating)))

Conditional statements



In [ ]:

    
# Every indented line belongs to the if-block, so all three prints run
# when the condition is true.
if 10 > 8:
    print('10 is greater than 8.')
    print('10 is greater than 8.')
    print('10 is greater than 8.')



In [ ]:

    
a = True
b = 10
c = 20
# Both parts of the 'and' are true, so the first branch runs.
print('first if statement...')
if b < c and a:
    print('All fine.')
else:
    print('Not all fine.')

# 'not a' is False, so the whole condition is False and the else branch runs.
print('second if statement...')
if b < c and (not a):
    print('All fine.')
else:
    print('Not all fine.')



In [ ]:

    
# Only the first branch whose condition is true runs.  Both comparisons here
# are false, so the else branch assigns `message`, which is then displayed.
if 10 > 20:
    message = "if only 10 were greater than 20"
elif 10 > 30:
    message = "elif means 'else if'"
else:
    message = "when all else fails use else " 
message

Loops



In [ ]:

    
# A for loop visits the items of the list one by one, in order.
for i in [1, 2, 3, 4, 5]:
    print(i)



In [ ]:

    
# range(10) would run up to 9, but the loop is left as soon as x reaches 5.
# (With range(5) the value 5 never occurred, so `break` could never run.)
for x in range(10):
    if x == 3:
        continue  # go immediately to the next iteration
    if x == 5:
        break     # quit the loop entirely
    print(x)



In [ ]:

    
x = 0
# The condition is re-checked before every pass; the loop ends once x reaches 5.
while x < 5:
    print('{} is less than 5'.format(x))
    x += 1



In [ ]:

    
a = True
x = 0
# The loop is controlled by the flag `a`, which is switched off once x reaches 10.
while a:
    print('{} is less than 10'.format(x))
    x += 1
    if x >= 10:
        a = False

Data structures



In [215]:

    
# Lists: an ordered, mutable sequence written in square brackets.
numbers = [1, 4, 9, 16, 25]



In [ ]:

    
# A slice with both bounds omitted covers the whole list.
numbers[:]



In [ ]:

    
# Everything before index 2, i.e. the first two items.
numbers[:2]



In [ ]:

    
# Everything from index 2 to the end of the list.
numbers[2:]



In [ ]:

    
# Inspect the type of the object bound to `numbers`.
type(numbers)



In [ ]:

    
letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g']
# len() gives the number of items in the list.
len(letters)



In [ ]:

    
# Indexing is zero-based, so index 2 is the third item.
letters[2]



In [ ]:

    
# A list may mix floats and ints and may hold the same value more than once.
a = [66.25, 333, 333, 1, 1234.5]
a



In [ ]:

    
# count() reports how often a value occurs; it is 0 for a value that is absent.
a.count(333), a.count(66.25), a.count('x')



In [ ]:

    
# insert(i, x) puts x in front of the item currently at index i.
a.insert(2, -1)
a



In [ ]:

    
# append() adds a single item to the end of the list.
a.append(333)
a



In [ ]:

    
# index() returns the position of the first item equal to the argument.
a.index(333)



In [ ]:

    
# remove() deletes only the first item equal to the argument.
a.remove(333)
a



In [ ]:

    
# reverse() reverses the list in place (it returns None).
a.reverse()
a



In [ ]:

    
# sort() orders the list in place (it returns None).
a.sort()
a



In [ ]:

    
# pop() removes the last item of the list and returns it.
a.pop()



In [ ]:

    
# Show the list as left by the preceding pop().
a



In [232]:

    
# dictionaries: a mapping from keys (here names) to values (here numbers)
phones = {'Spiderman': 151984858, 'Me': 151234324}



In [ ]:

    
# Assigning to a key that does not exist yet adds a new entry.
phones['Superman'] = 15104928
phones



In [ ]:

    
# Look up the value stored under a key.
phones['Spiderman']



In [ ]:

    
# del removes the entry stored under the given key.
del phones['Me']
phones



In [ ]:

    
# Add one more entry.
phones['Batman'] = 15123545
phones



In [ ]:

    
# keys() gives the keys currently present in the dictionary.
phones.keys()



In [ ]:

    
# `in` tests whether a key is present in the dictionary.
'Ken' in phones



In [215]:

    
# tuples



In [ ]:

    
# Comma-separated values without brackets are packed into a tuple.
# NOTE(review): the variable name `tuple` shadows the built-in tuple type for
# the rest of the session; later cells refer to it, so a rename has to be done
# in all of them together.
tuple = 31213, 123453, 'hi Ml!'
tuple



In [ ]:

    
# Tuples are indexed like lists; index 0 is the first item.
tuple[0]



In [ ]:

    
# Index 2 is the third item.
tuple[2]



In [ ]:

    
# Tuples are immutable: assigning to an item raises TypeError.  The error is
# caught and printed so that the demonstration does not stop a "Run All".
try:
    tuple[1] = 1234
except TypeError as error:
    print('TypeError: {}'.format(error))



In [ ]:

    
# Tuples can be nested: this one holds the earlier tuple and a new five-item tuple.
tupleTheSecond = tuple, (1, 2, 3, 4, 5)
tupleTheSecond



In [244]:

    
# Unpacking: the two items of tupleTheSecond are bound to t1 and t2.
t1, t2 = tupleTheSecond



In [ ]:

    
# First item unpacked from tupleTheSecond.
t1



In [ ]:

    
# Second item unpacked from tupleTheSecond.
t2



In [ ]:

    
# zip() pairs the items of both tuples position by position and stops when
# the shorter one is exhausted.
for i, j in zip(t1, t2):
    print('{} {}'.format(i, j))



In [ ]:

    
# Inspect the type of the object bound to t1.
type(t1)



In [ ]:

    
# sets



In [249]:

    
# A list in which 'apple' and 'orange' each appear twice.
basket = ['apple', 'orange', 'apple', 'pear', 'orange', 'banana']



In [250]:

    
# Building a set from the list keeps each distinct value once.
fruit = set(basket)



In [ ]:

    
# Show the contents of the set.
fruit



In [ ]:

    
# Membership test for a value that was in the basket.
'orange' in fruit



In [ ]:

    
# Membership test for a value that was not in the basket.
'plum' in fruit

Pandas basics



In [254]:

    
# import the pandas library
import pandas as pd



In [ ]:

    
# Show the version of the pandas library (print(...) works on Python 2 and 3)
print(pd.__version__)



In [ ]:

    
# it is all about describing the data
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np

def randrange(n, vmin, vmax):
    '''
    Draw `n` random numbers spread uniformly between `vmin` and `vmax`.

    Returns an array of shape (n, ): samples from np.random.rand, stretched
    by the width of the interval and then shifted by `vmin`.
    '''
    span = vmax - vmin
    unit_samples = np.random.rand(n)
    return span * unit_samples + vmin

# Seed the generator so the random point clouds are identical on every run.
np.random.seed(42)

fig = plt.figure(figsize=(14, 12))
ax = fig.add_subplot(111, projection='3d')

n = 100

# For each set of style and range settings, plot n random points in the box
# defined by x in [23, 32], y in [0, 100], z in [zlow, zhigh].
for c, m, zlow, zhigh in [('r', 'o', -50, -25), ('b', '^', -30, -5)]:
    xs = randrange(n, 23, 32)
    ys = randrange(n, 0, 100)
    zs = randrange(n, zlow, zhigh)
    ax.scatter(xs, ys, zs, c=c, marker=m)

ax.set_xlabel('X Label')
ax.set_ylabel('Y Label')
ax.set_zlabel('Z Label')

plt.show()

House Sales in King County, USA

The dataset's features are self-explanatory. The dataset is taken from the Kaggle website.



In [257]:

    
# Read the CSV file kc_house_data.csv into the DataFrame `nn`, which the
# following cells explore and modify.
nn = pd.read_csv('kc_house_data.csv')



In [ ]:

    
# first 10 data records
nn.head(10)



In [ ]:

    
# Check whether any column contains null values.  Only the last expression
# of a cell is rendered automatically, so the per-column result is displayed
# explicitly; the row count stays the cell's own output.
display(nn.isnull().any())

len(nn)



In [262]:

    
# Add one record that sets only id and price; every other column is NaN.
# The values are numeric (not strings) so the id and price columns keep a
# numeric dtype, and pd.concat is used because DataFrame.append is not
# available in current pandas releases.
new_record = pd.DataFrame([{'id': 12345, 'price': 12345.23}], columns=nn.columns)
nn = pd.concat([nn, new_record], ignore_index=True)



In [ ]:

    
# Number of rows currently in nn.
len(nn)



In [ ]:

    
# Count the rows whose bedrooms value is missing.
len(nn.loc[nn['bedrooms'].isnull()])



In [ ]:

    
# Display the rows whose bedrooms value is missing.
nn.loc[nn['bedrooms'].isnull()]



In [266]:

    
# Drop every row that contains a NaN in any column; `nn` is rebound to the result.
nn = nn.dropna()



In [ ]:

    
# Check the number of rows with a missing bedrooms value after dropping NaNs.
# Only the last expression of a cell is rendered automatically, so this count
# is displayed explicitly; the total row count stays the cell's own output.
display(len(nn[nn.bedrooms.isnull()]))

len(nn)



In [ ]:

    
# Summary statistics of nn.
nn.describe()

Adding new columns



In [ ]:

    
# Conversion factor applied to the square-foot columns
# (0.092903 is 0.3048 ** 2, i.e. square metres per square foot).
foot_to_meter_ratio = 0.092903

# Living and lot areas converted and rounded to whole numbers.
nn['sqm2_living'] = (nn['sqft_living'] * foot_to_meter_ratio).round(0)
nn['sqm2_lot'] = (nn['sqft_lot'] * foot_to_meter_ratio).round(0)

# Raise the display limits so that all columns are shown.
pd.set_option("display.max_columns",99)
pd.set_option("display.max_rows",999)

nn.head()



In [ ]:

    
# Element-wise variant: map() calls the lambda once for every basement area,
# converting it with foot_to_meter_ratio and rounding the result.
nn['sqm2_basement'] = nn['sqft_basement'].map(
    lambda sqft: round(sqft * foot_to_meter_ratio, 0))
nn.head()



In [ ]:

    
# Flag column: 1 where the price is below 100000, otherwise 0.
condition = nn['price'] < 100000
nn['price_low'] = condition.astype('int64')
nn['price_low'].value_counts()



In [ ]:

    
# Rows whose price is below 100000.
new = nn.loc[nn['price'] < 100000]
new



In [ ]:

    
# How many rows there are for each distinct bedrooms value.
nn['bedrooms'].value_counts()



In [ ]:

    
# The same counts obtained by grouping on bedrooms and taking each group's size.
counts = nn.groupby('bedrooms').size()
counts



In [ ]:

    
# Check which values the waterfront column takes and how often each occurs.
nn['waterfront'].value_counts()



In [ ]:

    
# Keep only the rows where the waterfront column equals 1.
waterfront = nn.loc[nn['waterfront'] == 1]
waterfront



In [ ]:

    
# Combine two conditions with & (each comparison needs its own parentheses):
# rows where waterfront equals 1 and bedrooms equals 1.
waterfront_1_room = nn.loc[(nn['waterfront'] == 1) & (nn['bedrooms'] == 1)]
waterfront_1_room



In [ ]:

    
# Summary statistics restricted to the waterfront selection.
waterfront.describe()

Histograms - data distributions



In [303]:

    
# Histogram of the bedrooms column (raw counts per bin).  The `normed`
# keyword is dropped: False was its default and current matplotlib no longer
# accepts it.
plt.figure(figsize=(10, 5))
plt.hist(nn['bedrooms'])
plt.xlabel('bedrooms')
plt.ylabel('count')
plt.show()



In [ ]:

    
# Histogram of the price column (raw counts per bin).  The `normed` keyword
# is dropped: False was its default and current matplotlib no longer accepts it.
plt.figure(figsize=(10, 5))
plt.hist(nn['price'])
plt.xlabel('price')
plt.ylabel('count')
plt.show()



In [ ]:

    
# Histogram of the sqft_living column (raw counts per bin).  The `normed`
# keyword is dropped: False was its default and current matplotlib no longer
# accepts it.
plt.figure(figsize=(10, 5))
plt.hist(nn['sqft_living'])
plt.xlabel('sqft_living')
plt.ylabel('count')
plt.show()



In [ ]:

    
# Histogram of the sqft_lot column (raw counts per bin).  The `normed`
# keyword is dropped: False was its default and current matplotlib no longer
# accepts it.
plt.figure(figsize=(10, 5))
plt.hist(nn['sqft_lot'])
plt.xlabel('sqft_lot')
plt.ylabel('count')
plt.show()

Miscellaneous



In [ ]:

    
def colorFunction(x):
    """Return the colour name assigned to the value `x`.

    The values 0 through 9 each have a fixed colour; any other value gets
    'pink' (which is also the colour assigned to 5).
    """
    # Position in this tuple == the value the colour belongs to.
    palette = ('black', 'brown', 'red', 'blue', 'green',
               'pink', 'orange', 'cyan', 'yellow', 'magenta')
    for value, colour in enumerate(palette):
        if x == value:
            return colour
    return 'pink'
    
# One colour name per row, chosen from that row's bedrooms value.
nn['color'] = nn['bedrooms'].map(colorFunction)

# Scatter of the long/lat columns, coloured with the column built above.
figure = plt.figure()
figure.set_figheight(10)
figure.set_figwidth(15)
subplot = figure.add_subplot(111)
subplot.set_xlabel('Longitude')
subplot.set_ylabel('Latitude')
scatter = subplot.scatter(nn['long'], nn['lat'], s=10, c=nn['color'])
plt.show()



In [ ]:

    
# Every column except id, price, date and color is plotted against price.
features = nn.drop(['id','price','date','color'], axis = 1)

# One tall figure holding a 10 x 3 grid, one panel per feature column.
plt.figure(figsize=(20, 55))

y = nn['price']
for i, col in enumerate(features.columns):
    plt.subplot(10, 3, i+1)
    x = nn[col]
    x_unique = np.unique(x)
    # Degree-1 polynomial (straight line) fitted to price against this column.
    trend = np.poly1d(np.polyfit(x, y, 1))
    plt.plot(x, y, 'o')
    plt.plot(x_unique, trend(x_unique))
    plt.title(col)
    plt.xlabel(col)
    plt.ylabel('prices')
plt.show()



In [ ]:

    
# Fit a normal distribution to the prices.
(mu, sigma) = norm.fit(nn['price'])

# Histogram of the data with 60 bins, normalised to a density so that it is
# comparable with the fitted curve (`density` replaces the removed `normed`).
n, bins, patches = plt.hist(nn['price'], 60, density=True, facecolor='green', alpha=0.75)

# Overlay the fitted normal density evaluated at the bin edges
# (scipy's norm.pdf replaces the removed matplotlib.mlab.normpdf).
y = norm.pdf(bins, mu, sigma)
l = plt.plot(bins, y, 'r--', linewidth=2)

# Labels; the title now names the plotted quantity instead of "IQ".
plt.xlabel('Sales prices')
plt.ylabel('Probability density')
plt.title(r'$\mathrm{Histogram\ of\ sale\ prices:}\ \mu=%.3f,\ \sigma=%.3f$' % (mu, sigma))
plt.grid(True)

plt.show()

A bit about correlation



In [ ]:

    
# Plot the correlation heatmap.  The file is read again, so `nn` is reset to
# the raw data without the columns added earlier.
nn = pd.read_csv('kc_house_data.csv')
nn = nn.drop(['id'], axis=1)
plt.figure(figsize=(14, 12))
# Correlations are only defined for numeric columns; selecting them
# explicitly keeps corr() working when the frame also holds text columns.
sns.heatmap(nn.select_dtypes(include=['number']).corr())



In [ ]:

    
# showing correlations in the table

# Diverging colour map used for the correlation table's background gradient.
cmap = sns.diverging_palette(5, 250, as_cmap=True)

def magnify():
    """Return the table styles for the correlation table.

    The result is a list of dicts, each with a CSS `selector` and a list of
    (property, value) pairs under `props`: small header text, no cell
    padding, and larger text for whichever header or cell is hovered.
    """
    def rule(selector, *props):
        # One style entry in the selector/props dict form built below.
        return dict(selector=selector, props=list(props))

    return [
        rule("th", ("font-size", "7pt")),
        rule("td", ('padding', "0em 0em")),
        rule("th:hover", ("font-size", "12pt")),
        rule("tr:hover td:hover", ('max-width', '200px'), ('font-size', '12pt')),
    ]

# Correlation table of the numeric columns, shaded row by row with `cmap`.
# Values are shown with two decimals through Styler.format (set_precision is
# not available in current pandas), and the caption typo is corrected.
nn.select_dtypes(include=['number']).corr().style.background_gradient(cmap, axis=1)\
    .set_properties(**{'max-width': '80px', 'font-size': '10pt'})\
    .set_caption("Hover to magnify")\
    .format("{:.2f}")\
    .set_table_styles(magnify())



In [ ]:

House Prices - Data fields description

Here's a brief version of what you'll find in the data description file.

SalePrice - the property's sale price in dollars. This is the target variable that you're trying to predict.
MSSubClass: The building class
MSZoning: The general zoning classification
LotFrontage: Linear feet of street connected to property
LotArea: Lot size in square feet
Street: Type of road access
Alley: Type of alley access
LotShape: General shape of property
LandContour: Flatness of the property
Utilities: Type of utilities available
LotConfig: Lot configuration
LandSlope: Slope of property
Neighborhood: Physical locations within Ames city limits
Condition1: Proximity to main road or railroad
Condition2: Proximity to main road or railroad (if a second is present)
BldgType: Type of dwelling
HouseStyle: Style of dwelling
OverallQual: Overall material and finish quality
OverallCond: Overall condition rating
YearBuilt: Original construction date
YearRemodAdd: Remodel date
RoofStyle: Type of roof
RoofMatl: Roof material
Exterior1st: Exterior covering on house
Exterior2nd: Exterior covering on house (if more than one material)
MasVnrType: Masonry veneer type
MasVnrArea: Masonry veneer area in square feet
ExterQual: Exterior material quality
ExterCond: Present condition of the material on the exterior
Foundation: Type of foundation
BsmtQual: Height of the basement
BsmtCond: General condition of the basement
BsmtExposure: Walkout or garden level basement walls
BsmtFinType1: Quality of basement finished area
BsmtFinSF1: Type 1 finished square feet
BsmtFinType2: Quality of second finished area (if present)
BsmtFinSF2: Type 2 finished square feet
BsmtUnfSF: Unfinished square feet of basement area
TotalBsmtSF: Total square feet of basement area
Heating: Type of heating
HeatingQC: Heating quality and condition
CentralAir: Central air conditioning
Electrical: Electrical system
1stFlrSF: First Floor square feet
2ndFlrSF: Second floor square feet
LowQualFinSF: Low quality finished square feet (all floors)
GrLivArea: Above grade (ground) living area square feet
BsmtFullBath: Basement full bathrooms
BsmtHalfBath: Basement half bathrooms
FullBath: Full bathrooms above grade
HalfBath: Half baths above grade
BedroomAbvGr: Number of bedrooms above basement level
Kitchen: Number of kitchens
KitchenQual: Kitchen quality
TotRmsAbvGrd: Total rooms above grade (does not include bathrooms)
Functional: Home functionality rating
Fireplaces: Number of fireplaces
FireplaceQu: Fireplace quality
GarageType: Garage location
GarageYrBlt: Year garage was built
GarageFinish: Interior finish of the garage
GarageCars: Size of garage in car capacity
GarageArea: Size of garage in square feet
GarageQual: Garage quality
GarageCond: Garage condition
PavedDrive: Paved driveway
WoodDeckSF: Wood deck area in square feet
OpenPorchSF: Open porch area in square feet
EnclosedPorch: Enclosed porch area in square feet
3SsnPorch: Three season porch area in square feet
ScreenPorch: Screen porch area in square feet
PoolArea: Pool area in square feet
PoolQC: Pool quality
Fence: Fence quality
MiscFeature: Miscellaneous feature not covered in other categories
MiscVal: $Value of miscellaneous feature
MoSold: Month Sold
YrSold: Year Sold
SaleType: Type of sale
SaleCondition: Condition of sale

More about this data set can be found on the Kaggle website.



In [ ]:

    
# Read housing_train.csv.  NOTE: this rebinds the name `data`, which the
# MNIST cell near the top of the notebook used for a different object.
data = pd.read_csv('housing_train.csv')
# Summary statistics of the dataset.
data.describe()



In [ ]:

    
# show the first 5 records of the dataset (head() defaults to 5 rows)

data.head()



In [ ]:

    
# show the last 5 records of the dataset (tail() defaults to 5 rows)

data.tail()



In [239]:

    
# row selection by slicing: the first 15 records (positions 0 to 14)
dataTemp = data[0:15]



In [ ]:

    
# Iteration over rows: iterrows() yields (index, row) pairs, which are
# unpacked here instead of indexing into the pair.
for index, row in dataTemp.iterrows():
    print(row['SalePrice'])



In [ ]:

    
# New column: apply() calls the lambda on every sale price, scaling it by 1.1.
data['Lambda'] = data['SalePrice'].apply(lambda price: price * 1.1)
# First fifteen rows again, now including the new column ...
dataTemp = data[:15]
# ... of which every third row is displayed.
dataTemp[::3]



In [243]:

    
# Keep only the columns used in the rest of the notebook.  .copy() makes
# `data` an independent frame, so columns added to it later are not written
# to a slice of the original (which triggers SettingWithCopyWarning).
columns = ['SalePrice', 'LotArea', '1stFlrSF', '2ndFlrSF', 'BedroomAbvGr', 'YrSold']
data = data[columns].copy()



In [ ]:

    
# One histogram (raw counts per bin) for each of the three columns.  The
# repeated copy-pasted block is folded into a loop and the `normed` keyword
# is dropped: False was its default and current matplotlib no longer accepts it.
for column in ['SalePrice', 'LotArea', 'BedroomAbvGr']:
    plt.figure(figsize=(10, 5))
    plt.hist(data[column])
    plt.xlabel(column)
    plt.ylabel('count')
    plt.show()



In [ ]:

    
# Number of entries in the SalePrice column.
len(data['SalePrice'])



In [246]:

    
# Data filtering: an independent copy holding just three of the columns.
dataFiltering = data[['SalePrice', 'BedroomAbvGr','LotArea']].copy()



In [ ]:

    
# First rows of the three-column copy.
dataFiltering.head()



In [ ]:

    
# Handling NaN values
original = pd.read_csv('housing_train.csv')
# original.isnull().any() flags, per column, whether it holds any missing
# value; used as a column mask it keeps only the columns that do.
original.loc[:, original.isnull().any()]



In [ ]:

    
# Three alternative ways of dealing with the missing LotFrontage values.
# None of the results is assigned, so `original` itself stays unchanged and
# only the last expression is shown as the cell output.
original.dropna(subset=["LotFrontage"])    # option 1: drop the rows lacking a value

original.drop("LotFrontage", axis=1)       # option 2: drop the whole column

# The median is taken from `original` (the name `housing` was never defined).
median = original["LotFrontage"].median()
original["LotFrontage"].fillna(median)     # option 3: fill the gaps with the median



In [ ]:

    
# Rows whose MSZoning text contains 'RL' (prefixing the mask with ~ would
# select the complementary rows instead).
count = original.loc[original["MSZoning"].str.contains('RL')]
len(count)



In [ ]:

    
# Fit a normal distribution to the sale prices.
(mu, sigma) = norm.fit(data['SalePrice'])

# Histogram of the data with 60 bins, normalised to a density so that it is
# comparable with the fitted curve (`density` replaces the removed `normed`).
n, bins, patches = plt.hist(data['SalePrice'], 60, density=True, facecolor='green', alpha=0.75)

# Overlay the fitted normal density evaluated at the bin edges
# (scipy's norm.pdf replaces the removed matplotlib.mlab.normpdf).
y = norm.pdf(bins, mu, sigma)
l = plt.plot(bins, y, 'r--', linewidth=2)

# Labels; the title now names the plotted quantity instead of "IQ".
plt.xlabel('Sales prices')
plt.ylabel('Probability density')
plt.title(r'$\mathrm{Histogram\ of\ sale\ prices:}\ \mu=%.3f,\ \sigma=%.3f$' % (mu, sigma))
plt.grid(True)

plt.show()



In [ ]:

    
prices = data['SalePrice']
features = data.drop('SalePrice', axis = 1)

# A single figure with one row per feature.  The figure is created once,
# before the loop, and shown once after it; the number of rows and the
# height follow the number of feature columns instead of a fixed 5.
n_features = len(features.columns)
plt.figure(figsize=(20, 7 * n_features))

# i: index
for i, col in enumerate(features.columns):
    plt.subplot(n_features, 1, i+1)
    x = data[col]
    y = prices
    plt.plot(x, y, 'o')
    # Create regression line (degree-1 polynomial fit)
    plt.plot(np.unique(x), np.poly1d(np.polyfit(x, y, 1))(np.unique(x)))
    plt.title(col)
    plt.xlabel(col)
    plt.ylabel('prices')
plt.show()



In [ ]:

    
# Conversion factor applied to the square-foot lot area
# (0.092903 is 0.3048 ** 2, i.e. square metres per square foot).
foot_to_meter_ratio = 0.092903
# Lot area converted and rounded to a whole number.
data['LotAream2'] = (data['LotArea'] * foot_to_meter_ratio).round(0)
data.head()



In [ ]:

    
x = data['LotAream2']
y = prices
# Degree-1 polynomial (straight line) fitted to price against lot area.
trend = np.poly1d(np.polyfit(x, y, 1))
x_unique = np.unique(x)

plt.figure(figsize=(20, 10))
plt.plot(x, y, 'o')
plt.plot(x_unique, trend(x_unique))
plt.title(x.name)
plt.xlabel(x.name)
plt.ylabel('prices')
plt.show()



In [ ]:

    
# Two-column working copy plus the quantiles that will be used to trim it.
dataM2 = data.loc[:, ['SalePrice', 'LotAream2']].copy()
low, high = .05, .9
# One row per requested quantile, one column per data column.
quant_df = dataM2.quantile([low, high])
print(quant_df)



In [258]:

    
# Column by column, keep only the values strictly between that column's low
# and high quantile.
dataM2 = dataM2.apply(
    lambda col: col[col.gt(quant_df.loc[low, col.name]) & col.lt(quant_df.loc[high, col.name])],
    axis=0)



In [ ]:

    
# First rows of the trimmed frame.
dataM2.head()



In [ ]:

    
# Length of the SalePrice column of the trimmed frame.
len(dataM2['SalePrice'])



In [260]:

    
# Bring the bedroom count over from `data`, then drop every row that has a
# NaN in any column.
dataM2['BedroomAbvGr'] = data['BedroomAbvGr'].copy()
dataM2 = dataM2.dropna()
# head() used to sit in the middle of the cell, where its result was thrown
# away; as the last expression it now shows the cleaned frame.
dataM2.head()



In [ ]:

    
x = dataM2['LotAream2']
y = dataM2['SalePrice']
# Degree-1 polynomial (straight line) fitted to the trimmed data.
trend = np.poly1d(np.polyfit(x, y, 1))
x_unique = np.unique(x)

plt.figure(figsize=(20, 15))
plt.plot(x, y, 'o')
plt.plot(x_unique, trend(x_unique))
plt.title(x.name)
plt.xlabel(x.name)
plt.ylabel('prices')
plt.show()



In [ ]:

    
# How many rows there are for each distinct BedroomAbvGr value.
dataM2["BedroomAbvGr"].value_counts()



In [ ]:

    
# Colour every point by its bedroom count.  The numeric column is passed
# directly to scatter(), which maps it through a colour map; the previous
# grayscale strings exceeded 1.0 for any count of 1 or more, which is outside
# the 0-1 range matplotlib accepts, and gave the colorbar nothing to map.
color = dataM2["BedroomAbvGr"]
figure = plt.figure()
subplot = figure.add_subplot(111)
scatter = subplot.scatter(dataM2['LotAream2'], dataM2['SalePrice'], s=50, c=color)
subplot.set_xlabel('Lot in m2')
subplot.set_ylabel('Price')
colorbar = plt.colorbar(scatter)
colorbar.set_label('BedroomAbvGr')
figure.set_figheight(10)
figure.set_figwidth(15)
plt.show()



In [170]:

    
# Correlation matrix
corr_matrix = data.corr()



In [ ]:

    
# Correlation of every column with SalePrice, largest value first.
corr_matrix["SalePrice"].sort_values(ascending=False)



In [ ]:

    
# Columns compared pairwise in the scatter matrix.
attributes = [
    "SalePrice",
    "LotAream2",
    "BedroomAbvGr",
    "1stFlrSF",
    "2ndFlrSF",
]
scatter_matrix(data[attributes], figsize=(15, 15))
# Separate scatter of price against lot area; the low alpha makes dense
# regions stand out.
data.plot.scatter(x="LotAream2", y="SalePrice", alpha=0.1)
plt.show()



In [173]:

    
# a bit more data filtering
df = data[['SalePrice', 'LotAream2', 'BedroomAbvGr']].copy()



In [ ]:

    
# Only the last expression of a cell is rendered automatically, so the
# preview is displayed explicitly; the row count stays the cell's own output.
display(df.head())
len(df)



In [264]:

    
# Drop the rows whose lot area lies more than three standard deviations
# above the mean.
lot_area_cutoff = df['LotAream2'].mean() + 3 * df['LotAream2'].std()
filtered = df.drop(df.index[df['LotAream2'] > lot_area_cutoff])



In [ ]:

    
x = filtered['LotAream2']
y = filtered['SalePrice']
# Degree-1 polynomial (straight line) fitted to the filtered data.
trend = np.poly1d(np.polyfit(x, y, 1))
x_unique = np.unique(x)

plt.figure(figsize=(20, 15))
plt.plot(x, y, 'o')
plt.plot(x_unique, trend(x_unique))
plt.title(x.name)
plt.xlabel(x.name)
plt.ylabel('prices')
plt.show()



In [182]:

    
# Summary statistics of the filtered frame.
filtered.describe()



In [ ]:

    
# Summary statistics of the unfiltered frame, for comparison.
data.describe()



In [ ]:

    
# Number of rows per BedroomAbvGr value in the filtered frame (first groups only).
counts = filtered.groupby('BedroomAbvGr').size()
counts.head()



In [ ]:

    
# Rebuild `filtered` from df with the same rule as before: drop the rows whose
# lot area lies more than three standard deviations above the mean.
lot_area_cutoff = df['LotAream2'].mean() + 3 * df['LotAream2'].std()
filtered = df.drop(df.index[df['LotAream2'] > lot_area_cutoff])